# importing relevant modules
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# silence pandas' chained-assignment warning for the whole session
pd.set_option('mode.chained_assignment', None)
# loading the TetraDENSITY population density CSV into a DataFrame
# data source: https://figshare.com/articles/dataset/TetraDENSITY_Population_Density_dataset/5371633?file=20334360
animals = pd.read_csv('TetraDENSITY_v.1.csv')
# (number of rows, number of columns)
animals.shape
(18246, 19)
# per-column summary: non-null counts and dtypes, plus memory usage
animals.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 18246 entries, 0 to 18245 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Class 18246 non-null object 1 Order 18246 non-null object 2 Family 18246 non-null object 3 Genus 18246 non-null object 4 Species 18246 non-null object 5 Subspecies 827 non-null object 6 Longitude 18245 non-null float64 7 Latitude 18245 non-null float64 8 Locality 15092 non-null object 9 Country 18246 non-null object 10 Year 17127 non-null object 11 Season/Month 9911 non-null object 12 Habitat 8856 non-null object 13 Sampling_Area 11085 non-null float64 14 Sampling_Area_unit 11085 non-null object 15 Density 18246 non-null float64 16 Density_unit 18246 non-null object 17 Sampling_Method 15454 non-null object 18 Method_info 9619 non-null object dtypes: float64(4), object(15) memory usage: 2.6+ MB
# first 5 rows (head() defaults to 5)
animals.head()
| Class | Order | Family | Genus | Species | Subspecies | Longitude | Latitude | Locality | Country | Year | Season/Month | Habitat | Sampling_Area | Sampling_Area_unit | Density | Density_unit | Sampling_Method | Method_info | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Amphibia | Anura | Brachycephalidae | Brachycephalus | didactylus | NaN | -44.200000 | -23.183 | Vila Dois Rios (Ilha Grande) | Brazil | 1997 | January – May | Atlantic rainforest | 0.0064 | ha | 52.0 | ind/ha | Incomplete_counts | 24 plots 8x8m |
| 1 | Amphibia | Anura | Brachycephalidae | Brachycephalus | didactylus | NaN | -44.200000 | -23.183 | Vila Dois Rios (Ilha Grande) | Brazil | 1997 | January – May | Atlantic rainforest | 0.0002 | ha | 1778.0 | ind/ha | Incomplete_counts | 90 plots 2x1m and litter removal method |
| 2 | Amphibia | Anura | Brachycephalidae | Brachycephalus | didactylus | NaN | -42.583000 | -22.417 | Fazenda Santa Bárbara in the Parque Estadual d... | Brazil | 2006 | Late October - Early November | Atlantic rainforest | 0.0025 | ha | 400.0 | ind/ha | Incomplete_counts | 25 plots 5x5m |
| 3 | Amphibia | Anura | Brachycephalidae | Brachycephalus | hermogenesi | NaN | -48.266667 | -25.150 | Reserva Particular do Patrimônio Natural Salto... | Brazil | 2009-2010 | Summer | Rainforest | 0.2560 | ha | 16.0 | ind/ha | Incomplete_counts | Counts in plots |
| 4 | Amphibia | Anura | Brachycephalidae | Brachycephalus | hermogenesi | NaN | -48.266667 | -25.150 | Reserva Particular do Patrimônio Natural Salto... | Brazil | 2009-2010 | Autumn | Rainforest | 0.2560 | ha | 16.0 | ind/ha | Incomplete_counts | Counts in plots |
# the two sampling-method columns are not needed for this analysis
animals = animals.drop(columns = ['Sampling_Method', 'Method_info'])
# which units is the sampling area recorded in?
animals['Sampling_Area_unit'].unique()
array(['ha', nan, 'km2'], dtype=object)
# defining function to convert area from hectares to km2
def ha_to_km2(ha):
    """Convert an area in hectares to square kilometres.

    1 ha = 0.01 km2. Works on a scalar or on anything that supports
    multiplication by a float (e.g. a pandas Series).

    NOTE: this is an *area* conversion. A density expressed per hectare
    scales the other way (x100) and must not be passed through here.
    """
    return ha * 0.01
# converting areas from hectares to km2, and changing units from 'ha' to 'km2'.
# A boolean mask with .loc writes straight into `animals`; the previous
# row-by-row chained indexing (animals[col][ind] = ...) is not guaranteed to
# write back to the frame. Rows whose unit is NaN are left untouched because
# NaN == 'ha' is False.
area_in_ha = animals['Sampling_Area_unit'] == 'ha'
animals.loc[area_in_ha, 'Sampling_Area'] = ha_to_km2(animals.loc[area_in_ha, 'Sampling_Area'])
animals.loc[area_in_ha, 'Sampling_Area_unit'] = 'km2'
animals['Sampling_Area_unit'].unique()
array(['km2', nan], dtype=object)
# checking which density units occur in the data
animals['Density_unit'].unique()
array(['ind/ha', 'males/ha', 'pairs/km2', 'ind/km2'], dtype=object)
# converting per-hectare densities (ind/ha, males/ha) to per-km2.
# 1 km2 = 100 ha, so a count per hectare becomes 100x larger per km2.
# The area helper ha_to_km2 multiplies by 0.01, which is the inverse of what
# a density needs, so the factor is applied explicitly here.
HA_PER_KM2 = 100
for per_ha_unit, per_km2_unit in (('ind/ha', 'ind/km2'), ('males/ha', 'males/km2')):
    unit_rows = animals['Density_unit'] == per_ha_unit
    # .loc with a mask updates `animals` in place (no chained assignment)
    animals.loc[unit_rows, 'Density'] = animals.loc[unit_rows, 'Density'] * HA_PER_KM2
    animals.loc[unit_rows, 'Density_unit'] = per_km2_unit
animals['Density_unit'].unique()
array(['ind/km2', 'males/km2', 'pairs/km2'], dtype=object)
# converting all to ind/km2 - assuming equal numbers of male and female animals,
# so both a males-only density and a pairs density are doubled.
# Done with one boolean mask and .loc instead of row-by-row chained
# assignment, which is not guaranteed to write back to `animals`.
needs_doubling = animals['Density_unit'].isin(['males/km2', 'pairs/km2'])
animals.loc[needs_doubling, 'Density'] = animals.loc[needs_doubling, 'Density'] * 2
animals.loc[needs_doubling, 'Density_unit'] = 'ind/km2'
animals['Density_unit'].unique()
array(['ind/km2'], dtype=object)
# viewing a random sample of 10 rows (no seed is set, so the rows differ between runs)
animals.sample(10)
| Class | Order | Family | Genus | Species | Subspecies | Longitude | Latitude | Locality | Country | Year | Season/Month | Habitat | Sampling_Area | Sampling_Area_unit | Density | Density_unit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5958 | Aves | Passeriformes | Pomatostomidae | Pomatostomus | temporalis | NaN | 132.100000 | -14.13000 | Munmarlary | Australia | 1987 | May | Tropical forest (Unburnt) | 0.1000 | km2 | 18.000000 | ind/km2 |
| 12474 | Mammalia | Cetartiodactyla | Cervidae | Rusa | unicolor | NaN | 101.370000 | 14.44000 | NaN | Thailand | NaN | NaN | NaN | NaN | NaN | 13.000000 | ind/km2 |
| 6780 | Aves | Passeriformes | Sylviidae | Sylvia | atricapilla | NaN | 23.700000 | 52.70000 | Białowieża National Park | Poland | 2003 | year-round | oak-hornbeam-lime forest | 0.2400 | km2 | 112.500000 | ind/km2 |
| 11208 | Mammalia | Cetartiodactyla | Bovidae | Raphicerus | campestris | NaN | 20.600000 | -18.80000 | Kaudom GP | Namibia | 1988 | NaN | NaN | 3841.0000 | km2 | 0.007810 | ind/km2 |
| 17999 | Reptilia | Squamata | Scincidae | Emoia | atrocostata | NaN | 123.200000 | 9.40000 | Polo (Negros Island) | Philippines | 1964 | July-November 1964 | Mangrove forest | 0.0359 | km2 | 1.186630 | ind/km2 |
| 17836 | Reptilia | Squamata | Lacertidae | Zootoca | vivipara | NaN | 6.138000 | 51.54700 | de Hamert reserve | Netherlands | 1981 | NaN | NaN | 0.0120 | km2 | 0.940000 | ind/km2 |
| 9234 | Mammalia | Carnivora | Canidae | Cuon | alpinus | NaN | 80.560190 | 22.29183 | NaN | India | 1996 | NaN | NaN | 940.0000 | km2 | 0.323404 | ind/km2 |
| 758 | Aves | Anseriformes | Anatidae | Cairina | moschata | NaN | -62.466000 | -4.33300 | Terra Firme | Brazil | 2002-2003 | NaN | Upland Forest | 4.5000 | km2 | 1.150000 | ind/km2 |
| 14887 | Mammalia | Primates | Hominidae | Gorilla | gorilla | NaN | 14.583333 | 1.10000 | Mbomo | Congo | 1989-1990 | NaN | NaN | 1.9100 | km2 | 0.600000 | ind/km2 |
| 4573 | Aves | Passeriformes | Motacillidae | Anthus | trivialis | NaN | 23.700000 | 52.70000 | Białowieża National Park | Poland | 2000 | year-round | ash-alder forest | 0.3300 | km2 | 6.061000 | ind/km2 |
# row count per taxonomic class, at matplotlib's default figure size
plt.rcParams["figure.figsize"] = plt.rcParamsDefault["figure.figsize"]
count_plot = sns.countplot(data = animals, x = 'Class')
# the ten countries with the most rows, on a wide and short figure
plt.rcParams["figure.figsize"] = (10, 2)
country_count = animals['Country'].value_counts().head(10)
country_count.plot.bar(title = 'Top 10 countries by row count')
<AxesSubplot: title={'center': 'Top 10 countries by row count'}>
# the 80 most frequent 'Year' values, shown in index (label) order
plt.rcParams["figure.figsize"] = (15, 5)
year_count = animals['Year'].value_counts().head(80).sort_index()
year_count.plot.bar(title = 'Row counts by year')
<AxesSubplot: title={'center': 'Row counts by year'}>
# the ten families with the most rows (value_counts sorts descending by default)
animals['Family'].value_counts().head(10)
Bovidae 2124 Cercopithecidae 726 Sylviidae 564 Fringillidae 537 Paridae 523 Cervidae 478 Elephantidae 431 Muscicapidae 422 Cricetidae 418 Felidae 400 Name: Family, dtype: int64
# top 10 genera by row count in Białowieża National Park.
# value_counts() on the column already counts per genus and sorts descending,
# so grouping the column by itself (which duplicated the index level) and
# re-sorting are unnecessary.
in_bialowieza = animals['Locality'] == 'Białowieża National Park'
animals.loc[in_bialowieza, 'Genus'].value_counts().head(10)
Genus Genus Parus Parus 159 Dendrocopos Dendrocopos 108 Ficedula Ficedula 86 Turdus Turdus 75 Phylloscopus Phylloscopus 70 Regulus Regulus 50 Columba Columba 44 Sylvia Sylvia 36 Prunella Prunella 35 Fringilla Fringilla 35 Name: Genus, dtype: int64
# creating new dataset containing Białowieża National Park rows of genus Parus,
# without the first four columns (Class, Order, Family, Genus), which are
# constant or irrelevant for this subset.
# The columns are dropped by label via `columns=` rather than by passing a
# DataFrame slice as the labels argument.
is_bialowieza_parus = (animals['Locality'] == 'Białowieża National Park') & (animals['Genus'] == 'Parus')
parus = animals[is_bialowieza_parus].drop(columns = animals.columns[:4])
parus.sample(5)
| Species | Subspecies | Longitude | Latitude | Locality | Country | Year | Season/Month | Habitat | Sampling_Area | Sampling_Area_unit | Density | Density_unit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5437 | cristatus | NaN | 23.7 | 52.7 | Białowieża National Park | Poland | 2004 | year-round | pine-bilberry | 0.250 | km2 | 32.000 | ind/km2 |
| 5532 | major | NaN | 23.7 | 52.7 | Białowieża National Park | Poland | 2001 | year-round | oak-hornbeam-lime forest | 0.300 | km2 | 126.667 | ind/km2 |
| 5555 | major | NaN | 23.7 | 52.7 | Białowieża National Park | Poland | 2004 | year-round | ash-alder forest | 0.330 | km2 | 81.818 | ind/km2 |
| 5354 | caeruleus | NaN | 23.7 | 52.7 | Białowieża National Park | Poland | 2003 | year-round | oak-hornbeam-lime forest | 0.255 | km2 | 68.627 | ind/km2 |
| 5248 | ater | NaN | 23.7 | 52.7 | Białowieża National Park | Poland | 2004 | year-round | oak-hornbeam-lime forest | 0.300 | km2 | 16.667 | ind/km2 |
# checking which species occur in the parus subset
parus['Species'].unique()
array(['ater', 'caeruleus', 'cristatus', 'major', 'montanus', 'palustris'],
dtype=object)
# mean density per year for each Parus species in 'oak-hornbeam-lime forest',
# written out species by species; each series gets its own legend label
oak_forest = parus[parus['Habitat'] == 'oak-hornbeam-lime forest']

ater_oak = oak_forest[oak_forest['Species'] == 'ater'].groupby(['Year'])['Density'].mean()
ater_oak.plot(kind = 'line', x = 'Year', y = 'Density', title = 'Parus species population densities in oak-hornbeam lime forest', legend = True, label = 'ater')

caeruleus_oak = oak_forest[oak_forest['Species'] == 'caeruleus'].groupby(['Year'])['Density'].mean()
caeruleus_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'caeruleus')

cristatus_oak = oak_forest[oak_forest['Species'] == 'cristatus'].groupby(['Year'])['Density'].mean()
cristatus_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'cristatus')

major_oak = oak_forest[oak_forest['Species'] == 'major'].groupby(['Year'])['Density'].mean()
major_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'major')

montanus_oak = oak_forest[oak_forest['Species'] == 'montanus'].groupby(['Year'])['Density'].mean()
montanus_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'montanus')

palustris_oak = oak_forest[oak_forest['Species'] == 'palustris'].groupby(['Year'])['Density'].mean()
palustris_oak.plot(kind = 'line', x = 'Year', y = 'Density', legend = True, label = 'palustris')
<AxesSubplot: title={'center': 'Parus species population densities in oak-hornbeam lime forest'}, xlabel='Year'>
# the same plot built with a loop over the species names - much less code!
species = parus['Species'].unique()
oak_hornbeam = parus[parus['Habitat'] == 'oak-hornbeam-lime forest']
for species_name in species:
    # mean density per year for this species in the oak-hornbeam-lime habitat
    yearly_mean = oak_hornbeam[oak_hornbeam['Species'] == species_name].groupby(['Year'])['Density'].mean()
    yearly_mean.plot(
        kind = 'line',
        x = 'Year',
        y = 'Density',
        title = 'Parus species population densities in oak-hornbeam lime forest',
        legend = True,
        label = species_name,
    )
# keeping only the cat family (Felidae) and counting rows per genus
is_cat = animals['Family'] == 'Felidae'
cats = animals.loc[is_cat]
sns.countplot(data = cats, x = 'Genus')
<AxesSubplot: xlabel='Genus', ylabel='count'>
plt.rcParams["figure.figsize"] = (10, 3)
# big cats = the rows of `cats` whose genus is Panthera
is_panthera = cats['Genus'] == 'Panthera'
big_cats = cats.loc[is_panthera]
big_cats_count_plot = sns.countplot(data = big_cats, x = 'Species')
# random sample of 10 big-cat rows
big_cats.sample(10)
| Class | Order | Family | Genus | Species | Subspecies | Longitude | Latitude | Locality | Country | Year | Season/Month | Habitat | Sampling_Area | Sampling_Area_unit | Density | Density_unit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9464 | Mammalia | Carnivora | Felidae | Panthera | pardus | NaN | 35.57418 | -3.17037 | NaN | Tanzania | 1988 | NaN | NaN | 260.0 | km2 | 0.076923 | ind/km2 |
| 9439 | Mammalia | Carnivora | Felidae | Panthera | onca | NaN | -53.70000 | -26.60000 | Green Corridor I | Brazil | 2003-2014 | NaN | NaN | NaN | NaN | 0.009100 | ind/km2 |
| 9659 | Mammalia | Carnivora | Felidae | Panthera | tigris | NaN | 99.17000 | 15.42000 | Huai Kha Khaeng Wildlife Sanctuary | Thailand | 2005 | NaN | NaN | 2780.0 | km2 | 0.018201 | ind/km2 |
| 9389 | Mammalia | Carnivora | Felidae | Panthera | leo | NaN | 35.57418 | -3.17037 | NaN | Tanzania | 1988 | NaN | NaN | 260.0 | km2 | 0.348740 | ind/km2 |
| 9414 | Mammalia | Carnivora | Felidae | Panthera | leo | NaN | 35.00000 | -1.00000 | Ol Kinyei | Kenya | 2014 | August-October | NaN | NaN | NaN | 0.225000 | ind/km2 |
| 9549 | Mammalia | Carnivora | Felidae | Panthera | pardus | NaN | 80.56019 | 22.29183 | NaN | India | 1998 | NaN | NaN | 110.0 | km2 | 0.090909 | ind/km2 |
| 9545 | Mammalia | Carnivora | Felidae | Panthera | pardus | NaN | 80.41137 | 23.61895 | NaN | India | 1998 | NaN | NaN | 449.0 | km2 | 0.060134 | ind/km2 |
| 9601 | Mammalia | Carnivora | Felidae | Panthera | tigris | NaN | 79.47063 | 21.61774 | NaN | India | 1993 | NaN | NaN | 758.0 | km2 | 0.036939 | ind/km2 |
| 9614 | Mammalia | Carnivora | Felidae | Panthera | tigris | NaN | 78.93506 | 29.53330 | NaN | India | 1995 | NaN | NaN | 1319.0 | km2 | 0.101592 | ind/km2 |
| 9477 | Mammalia | Carnivora | Felidae | Panthera | pardus | NaN | 76.43633 | 27.31565 | NaN | India | 1991 | NaN | NaN | 866.0 | km2 | 0.032333 | ind/km2 |
# removing the first four columns (Class, Order, Family, Genus): they are
# identical for every Panthera row. Dropped by label via `columns=` rather
# than by passing a DataFrame slice as the labels argument.
big_cats = big_cats.drop(columns = big_cats.columns[:4])
big_cats.head(5)
| Species | Subspecies | Longitude | Latitude | Locality | Country | Year | Season/Month | Habitat | Sampling_Area | Sampling_Area_unit | Density | Density_unit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9368 | leo | NaN | 15.78200 | -19.01763 | NaN | Namibia | 1926 | NaN | NaN | 74000.0 | km2 | 0.002838 | ind/km2 |
| 9369 | leo | NaN | 36.11150 | -4.15202 | NaN | Tanzania | 1962 | NaN | NaN | 1683.0 | km2 | 0.008913 | ind/km2 |
| 9370 | leo | NaN | 36.11150 | -4.15202 | NaN | Tanzania | 1962 | NaN | NaN | 1683.0 | km2 | 0.035651 | ind/km2 |
| 9371 | leo | NaN | 31.58213 | -23.98913 | NaN | South Africa | 1964 | NaN | NaN | 18989.0 | km2 | 0.058982 | ind/km2 |
| 9372 | leo | NaN | 35.57418 | -3.17037 | NaN | Tanzania | 1965 | NaN | NaN | 260.0 | km2 | 0.115385 | ind/km2 |
# big-cat row counts per 'Year' value (up to 80 most frequent), in label order
plt.rcParams["figure.figsize"] = (15, 5)
year_count = big_cats['Year'].value_counts().head(80).sort_index()
year_count.plot.bar(title = 'Row counts by year')
<AxesSubplot: title={'center': 'Row counts by year'}>
# visualising using plotly: one marker per big-cat row, coloured by species,
# with the row's Year as the hover title
import plotly.express as px
fig = px.scatter_geo(
    big_cats,
    lon = 'Longitude',
    lat = 'Latitude',
    color = 'Species',
    hover_name = "Year",
)
fig.update_layout(title_x = 0.5, title = 'Big Cat Population Locations')
fig.show()
# Thailand tiger population density over time
fig, ax = plt.subplots(figsize = (7, 3))
# tiger (species 'tigris') rows recorded at Huai Kha Khaeng Wildlife Sanctuary
thai_tigers = big_cats[(big_cats['Species'] == 'tigris')&(big_cats['Locality'] == 'Huai Kha Khaeng Wildlife Sanctuary')]
# legend = False states the intent explicitly; the previous empty list only
# hid the legend because it happens to be falsy
thai_tigers.plot(ax = ax, kind = 'line', x = 'Year', y = 'Density', title = 'Tiger Population Density - Huai Kha Khaeng Wildlife Sanctuary', legend = False)
<AxesSubplot: title={'center': 'Tiger Population Density - Huai Kha Khaeng Wildlife Sanctuary'}, xlabel='Year'>
# focussing on jaguars (species 'onca') whose Year entry is the range '2003-2014'
is_jaguar = big_cats['Species'] == 'onca'
is_2003_2014 = big_cats['Year'] == '2003-2014'
jaguars_0314 = big_cats[is_jaguar & is_2003_2014]
jaguars_0314
| Species | Subspecies | Longitude | Latitude | Locality | Country | Year | Season/Month | Habitat | Sampling_Area | Sampling_Area_unit | Density | Density_unit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9431 | onca | NaN | -53.8 | -22.30 | Ivinhema | Brazil | 2003-2014 | NaN | NaN | NaN | NaN | 0.0166 | ind/km2 |
| 9432 | onca | NaN | -40.3 | -19.20 | Vale NR I | Brazil | 2003-2014 | NaN | NaN | NaN | NaN | 0.0242 | ind/km2 |
| 9433 | onca | NaN | -53.7 | -26.60 | Green Corridor II | Brazil | 2003-2014 | NaN | NaN | NaN | NaN | 0.0107 | ind/km2 |
| 9434 | onca | NaN | -54.5 | -25.60 | Iguazú-San Jorge | Brazil | 2003-2014 | NaN | NaN | NaN | NaN | 0.0120 | ind/km2 |
| 9435 | onca | NaN | -54.2 | -25.85 | Iguazú-Urugua-í | Argentina | 2003-2014 | NaN | NaN | NaN | NaN | 0.0089 | ind/km2 |
| 9436 | onca | NaN | -52.3 | -22.50 | Morro do Diabo | Brazil | 2003-2014 | NaN | NaN | NaN | NaN | 0.0239 | ind/km2 |
| 9437 | onca | NaN | -48.2 | -24.60 | Intervales-PETAR | Brazil | 2003-2014 | NaN | NaN | NaN | NaN | 0.0066 | ind/km2 |
| 9438 | onca | NaN | -54.8 | -25.00 | Mbaracayú | Paraguay | 2003-2014 | NaN | NaN | NaN | NaN | 0.0129 | ind/km2 |
| 9439 | onca | NaN | -53.7 | -26.60 | Green Corridor I | Brazil | 2003-2014 | NaN | NaN | NaN | NaN | 0.0091 | ind/km2 |
# visualising using plotly: one marker per jaguar row, coloured by density
# on a green-to-red scale, with the locality as the hover title
fig = px.scatter_geo(
    jaguars_0314,
    lon = 'Longitude',
    lat = 'Latitude',
    color = 'Density',
    color_continuous_scale = ['green', 'red'],
    hover_name = "Locality",
)
fig.update_layout(title_x = 0.5, title = 'Jaguar Population Density 2003-14')
fig.show()
# visualising using geopandas
import geopandas as gpd
import descartes
from shapely.geometry import Point, Polygon
# had to download a shape file (.shp) of South America
# NOTE(review): this is an absolute path on the author's machine - point it
# at your own copy of the shape file before running.
jaguar_map = gpd.read_file('C:/Users/sypak/Downloads/data (1).zip')
# set coordinate reference system: EPSG:4326 (longitude/latitude in degrees).
# The '<authority>:<code>' string replaces the deprecated {'init': ...} dict,
# which makes pyproj emit a FutureWarning.
crs = 'EPSG:4326'
# convert longitude and latitude to point geometries (x = longitude, y = latitude)
geometry = [Point(xy) for xy in zip(jaguars_0314['Longitude'], jaguars_0314['Latitude'])]
# store a new version of jaguars_0314 as a GeoDataFrame
geo_df = gpd.GeoDataFrame(jaguars_0314, crs = crs, geometry = geometry)
C:\Users\sypak\AppData\Local\Programs\Python\Python310\lib\site-packages\pyproj\crs\crs.py:141: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
# country outlines in gray, with the jaguar points on top coloured by density
fig, ax = plt.subplots(figsize = (5, 3))
jaguar_map.boundary.plot(color = 'gray', ax = ax)
geo_df.plot(
    column = 'Density',
    cmap = 'OrRd',
    alpha = 0.9,
    legend = True,
    legend_kwds = {'label': "Population Density"},
    ax = ax,
)
# hide the axis frame and ticks, then title the map
ax.set_axis_off()
ax.set_title('Jaguar Population Density 2003-14')
Text(0.5, 1.0, 'Jaguar Population Density 2003-14')